package com.zhen.tika.demo;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Demo that parses a PDF file with Apache Tika's {@link PDFParser} and prints
 * the extracted metadata followed by the extracted body text to standard output.
 *
 * <p>Created with IntelliJ IDEA on 2019/1/8 22:09.
 *
 * @author zhen-desktop
 */
public class TikaParsePdf {

    /** PDF parsed when no path is supplied on the command line. */
    private static final String DEFAULT_PDF_PATH =
            "chapter2/TikaDemo/files/中国人工智能大会CCAI 2016圆满落幕.pdf";

    /**
     * Parses a PDF and prints every metadata entry and then the body text.
     *
     * @param args optional; when present, {@code args[0]} is the path of the PDF
     *             to parse, otherwise {@link #DEFAULT_PDF_PATH} is used
     * @throws IOException   if the file cannot be opened or read
     * @throws TikaException if Tika fails to parse the document
     * @throws SAXException  if the content handler rejects the SAX events
     */
    public static void main(String[] args) throws IOException, TikaException, SAXException {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        File pdfFile = new File(filePath);

        // Content handler that collects the body text of the document.
        // NOTE(review): the no-arg constructor uses Tika's default write limit;
        // confirm that is acceptable for very long documents.
        BodyContentHandler handler = new BodyContentHandler();
        // Metadata container filled in by the parser.
        Metadata metadata = new Metadata();
        // Parse context passed to the parser (no custom settings).
        ParseContext parseContext = new ParseContext();
        // Parser dedicated to PDF documents.
        PDFParser parser = new PDFParser();

        // try-with-resources guarantees the stream is closed even if parsing throws.
        try (FileInputStream inputStream = new FileInputStream(pdfFile)) {
            parser.parse(inputStream, handler, metadata, parseContext);
        }

        System.out.println("文件属性信息：");
        for (String name : metadata.names()) {
            System.out.println(name + ":" + metadata.get(name));
        }
        System.out.println("pdf文件中的内容：");
        System.out.println(handler.toString());
    }

}
